Wykorzystane biblioteki:
library(readxl)
library(dplyr)
library(tidyr)
library(ggplot2)
library(DT)
library(lattice)
library(plotly)
library(lubridate)
library(tibble)
library(rmarkdown)
library(zoo)
library(ggcorrplot)
library(caret)Poniższy blok kodu wczytuje dane:
setwd("D:\\studia\\ZED\\projekt\\Data pack\\")
goldPrice <- as_tibble(read.csv(file = "Gold prices.csv"))
currencyExchangeRates <- as_tibble(read.csv(file = "CurrencyExchangeRates.csv"))
spComposite <- as_tibble(read.csv(file = "S&P Composite.csv"))
worldDevelopmentIndicators <- as_tibble(read_excel("World_Development_Indicators.xlsx"))Poniższy blok kodu wczytuje dane odnośnie bitcoina:
setwd("D:\\studia\\ZED\\projekt\\Data pack\\Bitcoin")
bchain_metadata <- read.csv(file = "BCHAIN_metadata.csv")
bchain_mkpru <- read.csv(file = "BCHAIN-MKPRU.csv")Podsumowanie surowych danych.
summary(goldPrice)## Date USD..AM. USD..PM. GBP..AM.
## Length:13585 Min. : 34.77 Min. : 34.75 Min. : 14.48
## Class :character 1st Qu.: 280.50 1st Qu.: 281.50 1st Qu.: 177.71
## Mode :character Median : 383.32 Median : 383.50 Median : 234.51
## Mean : 575.20 Mean : 576.62 Mean : 370.84
## 3rd Qu.: 841.94 3rd Qu.: 851.50 3rd Qu.: 454.32
## Max. :2061.50 Max. :2067.15 Max. :1574.37
## NA's :1 NA's :143 NA's :11
## GBP..PM. EURO..AM. EURO..PM.
## Min. : 14.48 Min. : 237.3 Min. : 236.7
## 1st Qu.: 178.23 1st Qu.: 335.3 1st Qu.: 335.2
## Median : 234.96 Median : 892.6 Median : 896.1
## Mean : 371.81 Mean : 797.3 Mean : 797.2
## 3rd Qu.: 456.43 3rd Qu.:1114.1 3rd Qu.:1114.9
## Max. :1569.59 Max. :1743.8 Max. :1743.4
## NA's :154 NA's :7837 NA's :7880
Do dalszej analizy użyto cen złota podanej w dolarach, ponieważ miała ona najmniej nieustalonych wartości. Dane zostały zmodyfikowane, aby osiągnąć pojedynczą cenę złota na konkretny dzień. Wymagało to obliczania średniej z dwóch kolumn (ceny AM oraz PM, w przypadku braku jednej z nich brana jest dostępna wartość).
gp<- goldPrice %>%
mutate(Date=as.Date(Date,format="%Y-%m-%d")) %>%
mutate(usd=
ifelse(is.na(USD..AM.), USD..PM.,
ifelse(is.na(USD..PM.), USD..AM.,
round((USD..AM.+USD..PM.)/2.0,digits=2)
)
),
gbp=
ifelse(is.na(GBP..AM.), GBP..PM.,
ifelse(is.na(GBP..PM.), GBP..AM.,
round((GBP..AM.+GBP..PM.)/2.0,digits=2)
)
),
euro=
ifelse(is.na(EURO..AM.), EURO..PM.,
ifelse(is.na(EURO..PM.), EURO..AM.,
round((EURO..AM.+EURO..PM.)/2.0,digits=2)
)
)
) %>%
rename(g_date=Date, g_usd=usd, g_gbp=gbp,g_euro=euro) %>%
select(g_date,g_usd,g_gbp,g_euro)
summary(gp)## g_date g_usd g_gbp g_euro
## Min. :1968-01-02 Min. : 34.76 Min. : 14.48 Min. : 237.0
## 1st Qu.:1981-06-10 1st Qu.: 280.28 1st Qu.: 177.71 1st Qu.: 335.2
## Median :1994-11-14 Median : 383.38 Median : 234.51 Median : 894.7
## Mean :1994-11-16 Mean : 575.07 Mean : 370.78 Mean : 797.3
## 3rd Qu.:2008-04-23 3rd Qu.: 841.00 3rd Qu.: 454.80 3rd Qu.:1114.7
## Max. :2021-09-29 Max. :2058.15 Max. :1566.94 Max. :1736.2
## NA's :11 NA's :7837
gg<- ggplot(data=gp, aes(g_date)) +
geom_line(aes(y = g_usd, colour = "g_usd")) +
geom_line(aes(y = g_euro, colour = "g_euro")) +
geom_line(aes(y = g_gbp, colour = "g_gbp"))
ggplotly(gg)Z powyższego wykresu można spostrzec, że ceny złota w różnych walutach zachowują się podobnie.
Poniżej znajduje się krótkie podsumowanie wczytanych kursów walut.
colnames(currencyExchangeRates)## [1] "Date" "Algerian.Dinar"
## [3] "Australian.Dollar" "Bahrain.Dinar"
## [5] "Bolivar.Fuerte" "Botswana.Pula"
## [7] "Brazilian.Real" "Brunei.Dollar"
## [9] "Canadian.Dollar" "Chilean.Peso"
## [11] "Chinese.Yuan" "Colombian.Peso"
## [13] "Czech.Koruna" "Danish.Krone"
## [15] "Euro" "Hungarian.Forint"
## [17] "Icelandic.Krona" "Indian.Rupee"
## [19] "Indonesian.Rupiah" "Iranian.Rial"
## [21] "Israeli.New.Sheqel" "Japanese.Yen"
## [23] "Kazakhstani.Tenge" "Korean.Won"
## [25] "Kuwaiti.Dinar" "Libyan.Dinar"
## [27] "Malaysian.Ringgit" "Mauritian.Rupee"
## [29] "Mexican.Peso" "Nepalese.Rupee"
## [31] "New.Zealand.Dollar" "Norwegian.Krone"
## [33] "Nuevo.Sol" "Pakistani.Rupee"
## [35] "Peso.Uruguayo" "Philippine.Peso"
## [37] "Polish.Zloty" "Qatar.Riyal"
## [39] "Rial.Omani" "Russian.Ruble"
## [41] "Saudi.Arabian.Riyal" "Singapore.Dollar"
## [43] "South.African.Rand" "Sri.Lanka.Rupee"
## [45] "Swedish.Krona" "Swiss.Franc"
## [47] "Thai.Baht" "Trinidad.And.Tobago.Dollar"
## [49] "Tunisian.Dinar" "U.A.E..Dirham"
## [51] "U.K..Pound.Sterling" "U.S..Dollar"
nrow(currencyExchangeRates)## [1] 5978
summary(currencyExchangeRates)## Date Algerian.Dinar Australian.Dollar Bahrain.Dinar
## Length:5978 Min. : 71.29 Min. :0.4833 Min. :0.376
## Class :character 1st Qu.: 77.50 1st Qu.:0.6654 1st Qu.:0.376
## Mode :character Median : 81.28 Median :0.7595 Median :0.376
## Mean : 90.59 Mean :0.7683 Mean :0.376
## 3rd Qu.:108.88 3rd Qu.:0.8689 3rd Qu.:0.376
## Max. :115.58 Max. :1.1055 Max. :0.376
## NA's :4112 NA's :263 NA's :69
## Bolivar.Fuerte Botswana.Pula Brazilian.Real Brunei.Dollar
## Min. : 2.14 Min. :0.0855 Min. :0.832 Min. :1.000
## 1st Qu.: 2.59 1st Qu.:0.1197 1st Qu.:1.709 1st Qu.:1.348
## Median : 6.28 Median :0.1528 Median :2.048 Median :1.468
## Mean : 835.09 Mean :0.1965 Mean :2.161 Mean :1.508
## 3rd Qu.: 6.28 3rd Qu.:0.1844 3rd Qu.:2.794 3rd Qu.:1.698
## Max. :68827.50 Max. :4.8414 Max. :4.195 Max. :1.851
## NA's :3664 NA's :1275 NA's :539 NA's :1246
## Canadian.Dollar Chilean.Peso Chinese.Yuan Colombian.Peso
## Min. :0.917 Min. :377.5 Min. :6.093 Min. : 833.2
## 1st Qu.:1.086 1st Qu.:503.5 1st Qu.:6.495 1st Qu.:1786.0
## Median :1.297 Median :538.6 Median :6.989 Median :2017.6
## Mean :1.268 Mean :561.8 Mean :7.316 Mean :2073.1
## 3rd Qu.:1.409 3rd Qu.:619.8 3rd Qu.:8.277 3rd Qu.:2482.9
## Max. :1.613 Max. :758.2 Max. :8.746 Max. :3434.9
## NA's :356 NA's :1220 NA's :1316 NA's :582
## Czech.Koruna Danish.Krone Euro Hungarian.Forint
## Min. :14.45 Min. :4.665 Min. :0.8252 Min. :144.1
## 1st Qu.:19.35 1st Qu.:5.612 1st Qu.:1.0889 1st Qu.:202.7
## Median :21.88 Median :6.051 Median :1.2295 Median :224.3
## Mean :22.95 Mean :6.281 Mean :1.2076 Mean :231.1
## 3rd Qu.:24.94 3rd Qu.:6.805 3rd Qu.:1.3338 3rd Qu.:267.6
## Max. :40.29 Max. :9.006 Max. :1.5990 Max. :318.7
## NA's :1850 NA's :251 NA's :1070 NA's :1415
## Icelandic.Krona Indian.Rupee Indonesian.Rupiah Iranian.Rial
## Min. : 54.72 Min. :31.37 Min. : 2201 Min. : 1699
## 1st Qu.: 70.28 1st Qu.:42.82 1st Qu.: 8855 1st Qu.: 1755
## Median : 83.48 Median :45.92 Median : 9260 Median : 8992
## Mean : 92.46 Mean :48.02 Mean : 9144 Mean :10718
## 3rd Qu.:117.15 3rd Qu.:52.33 3rd Qu.:11380 3rd Qu.:11180
## Max. :147.98 Max. :68.78 Max. :14850 Max. :42000
## NA's :354 NA's :429 NA's :1492 NA's :1312
## Israeli.New.Sheqel Japanese.Yen Kazakhstani.Tenge Korean.Won
## Min. :3.230 Min. : 75.86 Min. :117.2 Min. : 756
## 1st Qu.:3.676 1st Qu.:100.70 1st Qu.:145.4 1st Qu.:1013
## Median :3.882 Median :109.39 Median :150.3 Median :1122
## Mean :4.003 Mean :107.97 Mean :185.6 Mean :1100
## 3rd Qu.:4.370 3rd Qu.:118.38 3rd Qu.:185.7 3rd Qu.:1186
## Max. :4.994 Max. :147.00 Max. :383.9 Max. :1965
## NA's :1939 NA's :316 NA's :3051 NA's :601
## Kuwaiti.Dinar Libyan.Dinar Malaysian.Ringgit Mauritian.Rupee
## Min. :0.2646 Min. :0.525 Min. :2.436 Min. :25.15
## 1st Qu.:0.2854 1st Qu.:0.662 1st Qu.:3.188 1st Qu.:29.12
## Median :0.2947 Median :1.932 Median :3.676 Median :30.67
## Mean :0.2936 Mean :1.510 Mean :3.508 Mean :31.03
## 3rd Qu.:0.3027 3rd Qu.:1.932 3rd Qu.:3.800 3rd Qu.:32.89
## Max. :0.3089 Max. :1.932 Max. :4.725 Max. :36.50
## NA's :1054 NA's :123 NA's :301 NA's :2460
## Mexican.Peso Nepalese.Rupee New.Zealand.Dollar Norwegian.Krone
## Min. : 5.915 Min. : 49.88 Min. :0.3927 Min. :4.959
## 1st Qu.:10.953 1st Qu.: 68.33 1st Qu.:0.5813 1st Qu.:6.104
## Median :12.680 Median : 74.04 Median :0.6844 Median :6.709
## Mean :13.116 Mean : 77.37 Mean :0.6606 Mean :6.965
## 3rd Qu.:13.668 3rd Qu.: 86.80 3rd Qu.:0.7364 3rd Qu.:7.806
## Max. :21.908 Max. :109.98 Max. :0.8822 Max. :9.606
## NA's :2266 NA's :479 NA's :310 NA's :291
## Nuevo.Sol Pakistani.Rupee Peso.Uruguayo Philippine.Peso
## Min. :2.539 Min. : 30.88 Min. : 9.32 Min. :24.55
## 1st Qu.:2.755 1st Qu.: 51.79 1st Qu.:20.07 1st Qu.:43.18
## Median :2.819 Median : 60.75 Median :22.94 Median :44.40
## Mean :2.960 Mean : 70.24 Mean :24.11 Mean :45.01
## 3rd Qu.:3.243 3rd Qu.: 94.29 3rd Qu.:28.44 3rd Qu.:47.10
## Max. :3.522 Max. :115.70 Max. :32.53 Max. :52.35
## NA's :4297 NA's :488 NA's :4287 NA's :4198
## Polish.Zloty Qatar.Riyal Rial.Omani Russian.Ruble
## Min. :2.022 Min. :3.64 Min. :0.3845 Min. :23.13
## 1st Qu.:3.033 1st Qu.:3.64 1st Qu.:0.3845 1st Qu.:28.27
## Median :3.290 Median :3.64 Median :0.3845 Median :30.54
## Mean :3.365 Mean :3.64 Mean :0.3845 Mean :36.91
## 3rd Qu.:3.822 3rd Qu.:3.64 3rd Qu.:0.3845 3rd Qu.:36.20
## Max. :4.500 Max. :3.64 Max. :0.3845 Max. :83.59
## NA's :1765 NA's :47 NA's :56 NA's :2435
## Saudi.Arabian.Riyal Singapore.Dollar South.African.Rand Sri.Lanka.Rupee
## Min. :3.745 Min. :1.201 Min. : 3.530 Min. : 49.57
## 1st Qu.:3.745 1st Qu.:1.361 1st Qu.: 6.213 1st Qu.: 77.54
## Median :3.750 Median :1.444 Median : 7.480 Median :103.99
## Mean :3.749 Mean :1.503 Mean : 8.113 Mean :102.19
## 3rd Qu.:3.750 3rd Qu.:1.687 3rd Qu.: 9.995 3rd Qu.:126.29
## Max. :3.750 Max. :1.851 Max. :16.771 Max. :157.65
## NA's :46 NA's :259 NA's :535 NA's :509
## Swedish.Krona Swiss.Franc Thai.Baht Trinidad.And.Tobago.Dollar
## Min. : 5.843 Min. :0.7253 Min. :24.44 Min. :5.839
## 1st Qu.: 6.838 1st Qu.:0.9777 1st Qu.:31.50 1st Qu.:6.260
## Median : 7.618 Median :1.1878 Median :34.65 Median :6.282
## Mean : 7.741 Mean :1.2090 Mean :35.14 Mean :6.310
## 3rd Qu.: 8.384 3rd Qu.:1.3903 3rd Qu.:39.45 3rd Qu.:6.382
## Max. :10.995 Max. :1.8228 Max. :56.06 Max. :6.789
## NA's :349 NA's :239 NA's :565 NA's :657
## Tunisian.Dinar U.A.E..Dirham U.K..Pound.Sterling U.S..Dollar
## Min. :1.342 Min. :3.671 Min. :1.213 Min. :1
## 1st Qu.:1.566 1st Qu.:3.672 1st Qu.:1.519 1st Qu.:1
## Median :1.723 Median :3.672 Median :1.599 Median :1
## Mean :1.850 Mean :3.672 Mean :1.615 Mean :1
## 3rd Qu.:2.157 3rd Qu.:3.672 3rd Qu.:1.676 3rd Qu.:1
## Max. :2.509 Max. :3.675 Max. :2.102 Max. :1
## NA's :4258 NA's :71 NA's :122
Poniższa komórka odpowiedzialna jest za rozpłaszenie danych w celu ułatwienia operowania na danych.
cer <- currencyExchangeRates %>%
mutate(Date=as.Date(Date,format="%Y-%m-%d"))%>%
gather(key="currency", value="value", 2:52) %>%
filter(!is.na(value))
summary(cer)## Date currency value
## Min. :1995-01-02 Length:243689 Min. : 0.09
## 1st Qu.:2002-03-01 Class :character 1st Qu.: 1.44
## Median :2008-01-10 Mode :character Median : 5.65
## Mean :2007-08-01 Mean : 485.89
## 3rd Qu.:2013-04-12 3rd Qu.: 57.11
## Max. :2018-05-02 Max. :68827.50
Prezentacja wybranych wartości
Poniższy kod prezentuje podsumowanie surowych danych. Można zauważyć, że jest w nich niewielka ilość brakujących wartości. W związku, z czym uzupełniono je danymi z wartościami z pomiaru poprzedniego dnia w przypadku i ich braku z dnia następnego. Nie usuwano wierszy, ponieważ brakujących wartości nie było dużo, a najbliższa wartość może oddawać najbardziej zbliżony stan.
spComposite <- spComposite %>%
mutate(Year=as.Date(Year,format="%Y-%m-%d")) %>%
arrange(Year)
summary(spComposite)## Year S.P.Composite Dividend Earnings
## Min. :1871-01-31 Min. : 2.730 Min. : 0.1800 Min. : 0.1600
## 1st Qu.:1908-10-07 1st Qu.: 7.902 1st Qu.: 0.4202 1st Qu.: 0.5608
## Median :1946-06-15 Median : 17.370 Median : 0.8717 Median : 1.4625
## Mean :1946-06-15 Mean : 327.968 Mean : 6.7321 Mean : 15.3714
## 3rd Qu.:1984-02-21 3rd Qu.: 164.400 3rd Qu.: 7.0525 3rd Qu.: 14.7258
## Max. :2021-10-31 Max. :4493.280 Max. :59.6800 Max. :158.7400
## NA's :4 NA's :4
## CPI Long.Interest.Rate Real.Price Real.Dividend
## Min. : 6.28 Min. : 0.620 Min. : 73.9 Min. : 5.445
## 1st Qu.: 10.20 1st Qu.: 3.171 1st Qu.: 186.6 1st Qu.: 9.417
## Median : 20.35 Median : 3.815 Median : 283.3 Median :14.411
## Mean : 62.39 Mean : 4.504 Mean : 622.0 Mean :17.498
## 3rd Qu.:102.28 3rd Qu.: 5.139 3rd Qu.: 707.0 3rd Qu.:22.301
## Max. :273.98 Max. :15.320 Max. :4477.2 Max. :63.511
## NA's :4
## Real.Earnings Cyclically.Adjusted.PE.Ratio
## Min. : 4.576 Min. : 4.784
## 1st Qu.: 14.063 1st Qu.:11.898
## Median : 23.524 Median :16.381
## Mean : 34.907 Mean :17.215
## 3rd Qu.: 43.768 3rd Qu.:20.913
## Max. :159.504 Max. :44.198
## NA's :4 NA's :120
head(spComposite)## # A tibble: 6 x 10
## Year S.P.Composite Dividend Earnings CPI Long.Interest.Rate Real.Price
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1871-01-31 4.44 0.26 0.4 12.5 5.32 97.3
## 2 1871-02-28 4.5 0.26 0.4 12.8 5.32 95.6
## 3 1871-03-31 4.61 0.26 0.4 13.0 5.33 96.6
## 4 1871-04-30 4.74 0.26 0.4 12.6 5.33 103.
## 5 1871-05-31 4.86 0.26 0.4 12.3 5.33 108.
## 6 1871-06-30 4.82 0.26 0.4 12.1 5.34 109.
## # ... with 3 more variables: Real.Dividend <dbl>, Real.Earnings <dbl>,
## # Cyclically.Adjusted.PE.Ratio <dbl>
count(spComposite)## # A tibble: 1 x 1
## n
## <int>
## 1 1810
spComposite <- spComposite %>% fill(names(.),.direction="updown")
summary(spComposite)## Year S.P.Composite Dividend Earnings
## Min. :1871-01-31 Min. : 2.730 Min. : 0.1800 Min. : 0.1600
## 1st Qu.:1908-10-07 1st Qu.: 7.902 1st Qu.: 0.4210 1st Qu.: 0.5637
## Median :1946-06-15 Median : 17.370 Median : 0.8833 Median : 1.4760
## Mean :1946-06-15 Mean : 327.968 Mean : 6.8451 Mean : 15.6882
## 3rd Qu.:1984-02-21 3rd Qu.: 164.400 3rd Qu.: 7.1425 3rd Qu.: 14.7525
## Max. :2021-10-31 Max. :4493.280 Max. :59.6800 Max. :158.7400
## CPI Long.Interest.Rate Real.Price Real.Dividend
## Min. : 6.28 Min. : 0.620 Min. : 73.9 Min. : 5.445
## 1st Qu.: 10.20 1st Qu.: 3.171 1st Qu.: 186.6 1st Qu.: 9.423
## Median : 20.35 Median : 3.815 Median : 283.3 Median :14.418
## Mean : 62.39 Mean : 4.504 Mean : 622.0 Mean :17.588
## 3rd Qu.:102.28 3rd Qu.: 5.139 3rd Qu.: 707.0 3rd Qu.:22.363
## Max. :273.98 Max. :15.320 Max. :4477.2 Max. :63.511
## Real.Earnings Cyclically.Adjusted.PE.Ratio
## Min. : 4.576 Min. : 4.784
## 1st Qu.: 14.074 1st Qu.:12.227
## Median : 23.546 Median :16.871
## Mean : 35.182 Mean :17.298
## 3rd Qu.: 43.819 3rd Qu.:20.478
## Max. :159.504 Max. :44.198
spComposite <- spComposite%>%
mutate(month = format(Year, "%m"), year = format(Year, "%Y"))%>%
select(-c('Year'))
head(spComposite)## # A tibble: 6 x 11
## S.P.Composite Dividend Earnings CPI Long.Interest.Rate Real.Price
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 4.44 0.26 0.4 12.5 5.32 97.3
## 2 4.5 0.26 0.4 12.8 5.32 95.6
## 3 4.61 0.26 0.4 13.0 5.33 96.6
## 4 4.74 0.26 0.4 12.6 5.33 103.
## 5 4.86 0.26 0.4 12.3 5.33 108.
## 6 4.82 0.26 0.4 12.1 5.34 109.
## # ... with 5 more variables: Real.Dividend <dbl>, Real.Earnings <dbl>,
## # Cyclically.Adjusted.PE.Ratio <dbl>, month <chr>, year <chr>
Poniżej znajduje się podsumowanie danych dotyczących światowych wskaźników rozwoju. Analiza ich wymagała zmiany struktury danych. Został stworzony dataframe, w którym pojedyncza obserwacja dotyczy jednego wskaźnika w danym roku i miejscu. Nie uzupełniano brakujących wartości w danych, ponieważ mnogość i różnorodność wskaźników nie pozwala, by zrobić to w sposób uniwersalny.
colnames(worldDevelopmentIndicators)## [1] "Country Name" "Country Code" "Series Name" "Series Code"
## [5] "1970 [YR1970]" "1971 [YR1971]" "1972 [YR1972]" "1973 [YR1973]"
## [9] "1974 [YR1974]" "1975 [YR1975]" "1976 [YR1976]" "1977 [YR1977]"
## [13] "1978 [YR1978]" "1979 [YR1979]" "1980 [YR1980]" "1981 [YR1981]"
## [17] "1982 [YR1982]" "1983 [YR1983]" "1984 [YR1984]" "1985 [YR1985]"
## [21] "1986 [YR1986]" "1987 [YR1987]" "1988 [YR1988]" "1989 [YR1989]"
## [25] "1990 [YR1990]" "1991 [YR1991]" "1992 [YR1992]" "1993 [YR1993]"
## [29] "1994 [YR1994]" "1995 [YR1995]" "1996 [YR1996]" "1997 [YR1997]"
## [33] "1998 [YR1998]" "1999 [YR1999]" "2000 [YR2000]" "2001 [YR2001]"
## [37] "2002 [YR2002]" "2003 [YR2003]" "2004 [YR2004]" "2005 [YR2005]"
## [41] "2006 [YR2006]" "2007 [YR2007]" "2008 [YR2008]" "2009 [YR2009]"
## [45] "2010 [YR2010]" "2011 [YR2011]" "2012 [YR2012]" "2013 [YR2013]"
## [49] "2014 [YR2014]" "2015 [YR2015]" "2016 [YR2016]" "2017 [YR2017]"
## [53] "2018 [YR2018]" "2019 [YR2019]" "2020 [YR2020]"
wdi <- gather(worldDevelopmentIndicators,key="year", value="developmentIndicators", 5:55) %>%
mutate(year = substr(year,1,4)) %>%
filter(developmentIndicators!="..") %>%
mutate_at("developmentIndicators", as.numeric) %>%
mutate_at("year", as.numeric) %>%
rename(countryCode="Country Code") %>%
rename(indicator="Series Code") %>%
rename(seriesName="Series Name")
wdi_tmp <-wdi %>% filter(countryCode %in% c("DEU","USA","GBR","JPN","RUS","IDN","POL","WLD","CHN"))
summary(wdi_tmp)## Country Name countryCode seriesName indicator
## Length:59534 Length:59534 Length:59534 Length:59534
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## year developmentIndicators
## Min. :1970 Min. :-4.813e+14
## 1st Qu.:1987 1st Qu.: 8.000e+00
## Median :2000 Median : 4.100e+01
## Mean :1998 Mean : 2.806e+12
## 3rd Qu.:2010 3rd Qu.: 4.643e+05
## Max. :2020 Max. : 7.614e+15
z <- translate%>%select("Indicator Name")
paged_table(z, options = list(cols.print = 10,cols.min.print=1))W tej sekcji wczytano dane dotyczące bitcoina. Zbiór nie zawierał pustych wartości. Wartym odnotowania jest fakt, że zanotowane ceny w pewnych momentach wynoszą 0 dolarów.
bchain_metadata %>%
filter(code %in% c("MKPRU")) %>%
select(code, name)## code name
## 1 MKPRU Bitcoin Market Price USD
summary(bchain_mkpru)## Date Value
## Length:4661 Min. : 0.0
## Class :character 1st Qu.: 7.2
## Mode :character Median : 431.9
## Mean : 5141.2
## 3rd Qu.: 6499.1
## Max. :63554.4
bchain_mkpru<- bchain_mkpru %>%
mutate(Date=as.Date(Date,format="%Y-%m-%d"))%>%
filter(Value!=0)
gg <- ggplot(data=bchain_mkpru, aes(x=Date,y=Value)) + geom_line()
ggplotly(gg)W tym rozdziale badać będę korelację między cenami złota i kryptowaluty. Na poniższym wykresie można zobaczyć zależność ceny drogocennego metalu oraz Bitcoina. Jeżeli wartości byłyby w silnej korelacji, punkty na wykresie znajdowałyby się na przekątnych wykresu. Można zobaczyć, że dopiero od 2017 roku warto badać tę zależność. Niestety korelacja w tych latach nie jest znacząca, najwyższą przypada na rok 2019 i wynosi około 0,7. W pozostałych latach ciężko odnaleźć zależność.
df <- bchain_mkpru %>% left_join(gp,c("Date"="g_date")) %>%
select(Date, Value, g_usd)%>%
filter(!is.na(Value) & !is.na(g_usd))
df2 <- df%>%
mutate(month = format(Date, "%m"), year = format(Date, "%Y")) %>%
group_by(month, year) %>%
summarise_at(c("g_usd","Value"),mean, na.rm = TRUE) %>%
rename(avgGold=g_usd,avgBit=Value)%>%
filter(avgGold!=0 & avgBit!=0)%>%
mutate(date = make_date(year=year, month=month))
gg <- ggplot(df2, aes(x=avgGold, y=avgBit,frame=year))+ geom_point()
ggplotly(gg)coeff <- 40
goldColor <-"green"
bitcoinColor<-"red"
ggplot(df, aes(x=Date))+
geom_line(aes(y=g_usd), color=goldColor) +
geom_line(aes(y=Value/coeff), color=bitcoinColor) +
scale_y_continuous(
name = "cena złota",
sec.axis = sec_axis( trans=~.*coeff,name="cena bitcoina")
) +
theme(
axis.title.y = element_text(color = goldColor, size=13),
axis.title.y.right = element_text(color = bitcoinColor, size=13)
)+
xlim(as.Date("2017-01-01",format="%Y-%m-%d"),as.Date("2021-09-29",format="%Y-%m-%d"))df1 <- gp %>% select(g_usd,g_date) %>% rename(Date=g_date)
df2 <- df1%>% inner_join(bchain_mkpru)%>%
group_by(year =year(Date)) %>%
summarize(corel=cor(g_usd,Value))
ggplot(data=df2, aes(x=as.character(year), y=corel)) +
xlab("year")+
ylab("correlation")+
geom_bar(stat="identity", width=0.2)W tej sekcji badano korelację ceny złota pomiędzy kursami walut. Poniżej znajduje się tabelka z wynikami wszystkich walut. Warto zauważyć, że nie można było wyznaczyć korelacji z walutami: Bahrain.Dinar, Qatar.Riyal, Rial.Omani oraz U.S..Dollar. Jest to spowodowane tym, że ich wartości każdego pomiaru są jednakowe.
gp_tmp <- gp %>% select(g_date, g_usd) %>% rename(Date=g_date, Value=g_usd)
currency <- unlist(unique(cer[c("currency")]))
experiment <- data.frame(indicator=c(),corelation=c())
for(i in currency){
tmp <- cer%>%filter(currency==i)%>%
inner_join(gp_tmp)%>%drop_na(value,Value)
corelation <- cor(tmp[c("value")],tmp[c("Value")])
tmp<-data.frame(i,corelation)
colnames(tmp)<-c("currency","corelation")
rownames(tmp) <- NULL
experiment<- rbind(experiment,tmp)
}
e<-experiment %>% arrange(desc(corelation))
prettyTable(e)W tym eksperymencie badana była korelacja między cenami złota oraz wskaźnikami światowymi. Wymagało to obliczenia średniej ceny złota dla poszczególnych lat ponieważ wskaźniki rejestrowane były dla poszczególnych lat. Postanowiłem również nie skupiać się na konkretnym kraju tylko na całości pomiarów. W praktyce oznaczało to wykorzystanie danych globalnych dla całego świata.
gpTmp <-gp %>%
mutate(year = format(g_date, "%Y")) %>%
group_by(year) %>%
summarise_at(vars(g_usd),list(avg = mean))%>%
select(year,avg)%>%
mutate_at("year", as.numeric)
wdiTmp <- wdi %>%
filter(countryCode =="WLD")%>%
select(year,developmentIndicators, seriesName ,indicator)
factor<- unlist(unique(wdiTmp[c("indicator")]))
experiment <- data.frame(indicator=c(),corelation=c())
for(i in factor){
wdiInd <- wdiTmp%>% filter(indicator ==i) %>%
inner_join(gpTmp,by="year")
corelation <- cor(wdiInd[c("developmentIndicators")],wdiInd[c("avg")])
tmp<-data.frame(i,corelation)
colnames(tmp)<-c("indicator","corelation")
rownames(tmp) <- NULL
experiment<- rbind(experiment,tmp)
}result1_experiment <- experiment %>% filter(corelation>0.9)
result1_experiment$description<-mapply(translateIndicator, result1_experiment$indicator)
prettyTable(result1_experiment %>% select(description, corelation))Powyższa tabela prezentuje 19 różnych wskaźników, które mają wysoki (powyżej 0.9) współczynnik korelacji z ceną złota.
result2_experiment <- experiment %>% filter(corelation< (-0.9))
result2_experiment$description<-mapply(translateIndicator, result2_experiment$indicator)
prettyTable(result2_experiment %>% select(description, corelation))Powyższa tabela prezentuje 11 różnych wskaźników, które mają wysoki (poniżej -0.9) współczynnik korelacji z ceną złota.
Poniższa tabela prezentuje zależności pomiędzy cenami złota oraz cenami spółki.
df1 <- gp %>%
select(g_date,g_usd) %>%
mutate(month = format(g_date, "%m"), year = format(g_date, "%Y"))%>%
group_by(month, year) %>%
mutate(g_usd = na.aggregate(g_usd, FUN = mean,na.rm=TRUE))%>%
mutate(Year = make_date(month=month,year=year))%>%
select(Year,g_usd)
df2 <- spComposite %>%
mutate(Year = make_date(month=month,year=year))
df3 <- df2 %>%
inner_join(df1)%>%
mutate(month = format(Year, "%m"), year = format(Year, "%Y"))
x<-cor(x=df3$g_usd, y=df3[!names(df3) %in% c("Year","g_usd","month","year")], use = "complete.obs" )
x <- as.data.frame(t(x))%>% rename(corelation=V1) %>% arrange(desc(corelation))
x<-rownames_to_column(x, "NAME")
prettyTable(x)Poniższa tabela prezentuje zależności pomiędzy cenami bitcoina oraz cenami spółki.
df1 <- bchain_mkpru %>%
mutate(month = format(Date, "%m"), year = format(Date, "%Y"))%>%
group_by(month, year) %>%
mutate(Value = na.aggregate(Value, FUN = mean,na.rm=TRUE))%>%
mutate(Year = make_date(month=month,year=year))%>%
select(Year,Value)%>%select(-c("month","year"))
df2 <- spComposite %>% mutate(Year = make_date(month=month,year=year))%>%select(-c("month","year"))
df3 <- df2 %>% inner_join(df1)%>%select(-c("month","year"))
x <- cor(x=df3$Value, y=df3[!names(df3) %in% c("Year","Value")], use = "complete.obs" )
x <- as.data.frame(t(x))%>% rename(corelation=V1) %>% arrange(desc(corelation))
x<-rownames_to_column(x, "NAME")
prettyTable(x)Poniższa tabela prezentuje zależności pomiędzy cenami bitcoina oraz kursami walut.
bp <- bchain_mkpru
currency <- unlist(unique(cer[c("currency")]))
experiment <- data.frame(indicator=c(),corelation=c())
for(i in currency){
tmp <- cer%>%filter(currency==i)%>%
inner_join(bp)%>%drop_na(value,Value)
corelation <- cor(tmp[c("value")],tmp[c("Value")])
tmp<-data.frame(i,corelation)
colnames(tmp)<-c("currency","corelation")
rownames(tmp) <- NULL
experiment<- rbind(experiment,tmp)
}
e<-experiment %>% arrange(desc(corelation))
prettyTable(e)Poniższa tabela prezentuje zależności pomiędzy cenami bitcoina oraz wskaźnikami światowego rozwoju.
bp <- bchain_mkpru
df2 <- bchain_mkpru%>%
mutate(year = format(Date, "%Y")) %>%
group_by(year) %>%
summarise(avgBit= mean(Value)) %>%
transform(year = as.numeric(year))
wdiTmp <- wdi %>%
filter(countryCode =="WLD")%>%
select(year,developmentIndicators, seriesName ,indicator)
factor<- unlist(unique(wdiTmp[c("indicator")]))
experiment <- data.frame(indicator=c(),corelation=c())
for(i in factor){
wdiInd <- wdiTmp%>% filter(indicator ==i) %>%
inner_join(df2,by="year")
corelation <- cor(wdiInd[c("developmentIndicators")],wdiInd[c("avgBit")])
tmp<-data.frame(i,corelation)
colnames(tmp)<-c("indicator","corelation")
rownames(tmp) <- NULL
experiment<- rbind(experiment,tmp)
}
result3_experiment <- experiment %>% filter(corelation>0.9)
result3_experiment$description<-mapply(translateIndicator, result3_experiment$indicator)
prettyTable(result3_experiment %>% select(description, corelation))W tej części skupiono się na przewidywaniu cen złota, ponieważ wartości tego zbiór posiadały większą korelację ze zbiorem kursów walut w porównaniu do cen kryptowaluty. W tym celu wykorzystano poniższe
wskaźniki światowe: GDP (current US$)
df_wld <- wdi %>%
filter(countryCode=="WLD" & indicator=="NY.GDP.MKTP.CD") %>%
rename(GPDpc=developmentIndicators)%>%
select(GPDpc, year)
gg <- ggplot(data=df_wld, aes(x=year,y=GPDpc)) +
geom_line()+
ggtitle("GPD")
ggplotly(gg)Poniższy macierz przedstawia korelację wszystkich dostępnych wartości ze zbioru indeksów giełdowych. W celu uniknięcia wykorzystywania nadmiarowej ilości danych nie wykorzystywano atrybutów, które w poniższej macierzy na przecięciu mają korelację równą 1. W związku, z czym wykorzystano tylko: Dividend, CPI oraz Real.Earnings.
tmpdf <- spComposite %>% select(-c(month,year))
corr <- round(cor(tmpdf), 1)
ggcorrplot(corr, type = "lower", lab = TRUE)Wizualizacja wybranych atrybutów indeksów giełdowych.
df_stonks <- spComposite %>%
select(year, month, Dividend, CPI, Real.Earnings)%>%
mutate(year=as.integer(year), month=as.integer(month))
gg <- ggplot(data=df_stonks, aes(x=year,y=Dividend)) +
geom_line()+
ggtitle("Dividend")
ggplotly(gg)gg <- ggplot(data=df_stonks, aes(x=year,y=CPI)) +
geom_line()+
ggtitle("CPI")
ggplotly(gg)gg <- ggplot(data=df_stonks, aes(x=year,y=Real.Earnings)) +
geom_line()+
ggtitle("Real.Earnings")
ggplotly(gg)Wizualizacja dwóch wybranych walut, które mają wysoki wskaźnik korelacji.
df_cur_Australian.Dollar <- cer %>% filter(currency %in% c("Australian.Dollar"))%>%
rename(Australian.Dollar=value) %>% select(Date, Australian.Dollar)
df_cur_Brunei.Dollar <- cer %>% filter(currency %in% c("Brunei.Dollar"))%>%
rename(Brunei.Dollar=value) %>% select(Date, Brunei.Dollar)
df_cur <- merge(df_cur_Australian.Dollar, df_cur_Brunei.Dollar, by="Date")
gg <- ggplot(data=df_cur, aes(Date)) +
geom_line(aes(y = Brunei.Dollar, colour = "Brunei.Dollar"))+
geom_line(aes(y = Australian.Dollar, colour = "Australian.Dollar"))+
ggtitle("Waluty")+
ylab("Value")
ggplotly(gg)W celu uzyskania tylko rekordów, które mają wszystkie dane zdecydowano się na łączenia typu inner join. Wiąże się to z wybraniem danych z lat 1998-2018, ponieważ właśnie z tych lat posiadamy dane odnośnie walut.
df_gold <- gp %>%
select(g_date,g_usd) %>% rename(Date=g_date)
all_ <- df_gold %>% inner_join((df_cur)) %>%
mutate(month =as.integer(format(Date, "%m")), year =as.integer( format(Date, "%Y")))%>%
inner_join(df_stonks, by = c("year" = "year", "month" = "month"))%>%
inner_join(df_wld, by=c("year"="year")) %>%select(-c(year, month))
summary(all_)## Date g_usd Australian.Dollar Brunei.Dollar
## Min. :1998-09-02 Min. : 252.9 Min. :0.4833 Min. :1.000
## 1st Qu.:2003-07-21 1st Qu.: 363.6 1st Qu.:0.6579 1st Qu.:1.347
## Median :2008-05-21 Median : 855.6 Median :0.7633 Median :1.464
## Mean :2008-06-11 Mean : 849.7 Mean :0.7741 Mean :1.507
## 3rd Qu.:2013-05-09 3rd Qu.:1260.0 3rd Qu.:0.8954 3rd Qu.:1.698
## Max. :2018-04-30 Max. :1893.0 Max. :1.1055 Max. :1.850
## Dividend CPI Real.Earnings GPDpc
## Min. :15.69 Min. :163.6 Min. : 8.805 Min. :3.140e+13
## 1st Qu.:16.74 1st Qu.:184.2 1st Qu.: 65.935 1st Qu.:3.895e+13
## Median :24.10 Median :212.2 Median : 89.879 Median :6.044e+13
## Mean :26.71 Mean :208.5 Mean : 84.207 Mean :5.786e+13
## 3rd Qu.:32.88 3rd Qu.:232.9 3rd Qu.:105.320 3rd Qu.:7.523e+13
## Max. :50.33 Max. :250.5 Max. :128.344 Max. :8.634e+13
all_together <- all_ %>% select(-c(Date))Sumarycznie powstało 4514 rekordów.
Jako model decyzyny wykorzystano drzewo warunkowego wnioskowania.
set.seed(9)
inTraining <-
createDataPartition(
y = all_together$g_usd,
p = .75,
list = FALSE)
training <- all_together[ inTraining,]
testing <- all_together[-inTraining,]
fitControl <- trainControl(method = "repeatedcv",
number = 10,
repeats = 10)
model <- train(g_usd ~ .,
data = training,
method = "ctree", # ctree>lm
trControl = fitControl)
model## Conditional Inference Tree
##
## 3386 samples
## 6 predictor
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 3047, 3047, 3047, 3048, 3048, 3048, ...
## Resampling results across tuning parameters:
##
## mincriterion RMSE Rsquared MAE
## 0.01 28.55823 0.9958482 15.12712
## 0.50 30.46865 0.9953961 16.54158
## 0.99 37.63105 0.9934008 20.58518
##
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mincriterion = 0.01.
predictions <- predict(model, testing)
postResample(pred = predictions, obs = testing$g_usd)## RMSE Rsquared MAE
## 22.6442503 0.9977324 13.5416709
tmp<- testing
tmp$pred<-predictions
tmp<-tmp%>%select(g_usd,pred)
head(tmp)## # A tibble: 6 x 2
## g_usd pred
## <dbl> <dbl>
## 1 1321. 1319.
## 2 1348. 1342.
## 3 1347. 1325.
## 4 1344. 1342.
## 5 1337. 1342.
## 6 1346. 1342.
Powyższy fragment kodu przedstawia faktyczne wartości oraz przykładowe predykcje.